import sklearn
sklearn.__version__
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from statsmodels.formula.api import ols
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
import xgboost as xgb
from IPython.display import Image
from sklearn.model_selection import train_test_split
from IPython.display import Image
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import matplotlib.pyplot as plt
import plotly_express as px
import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
from pygeocoder import Geocoder
import reverse_geocoder as rg
import pprint
# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.set_option('display.max_row', 100000)
pd.set_option('display.max_columns', 500000)

# Load the raw crime records.
dt_crime = pd.read_csv("crime.csv", encoding='unicode_escape')

# Quick structural overview of the data.
dt_crime.head()
dt_crime.shape
dt_crime.columns
dt_crime.describe()

# Pairwise relationships between the columns.
sns.pairplot(dt_crime)

# Correlation heatmap. Only numeric/bool columns are correlated: calling
# DataFrame.corr() on a frame that still holds string columns raises on
# recent pandas releases instead of silently skipping them.
plt.figure(figsize=(10, 10))
numeric_columns = dt_crime.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm')

# Column types and the number of missing values per column (largest first).
dt_crime.dtypes
dt_crime.isnull().sum().sort_values(ascending=False)

# Inspect the SHOOTING flag and measure how often it is 'Y'.
dt_crime['SHOOTING'].value_counts().plot.bar()
dt_crime.SHOOTING.unique()
shooting_y = dt_crime[dt_crime['SHOOTING'] == 'Y']
print(len(shooting_y))
print('Total:=', len(dt_crime))
print('Shooting as yes:', len(shooting_y)/len(dt_crime)*100)
# Only about 0.3% of the rows have SHOOTING == 'Y', so we can drop this column.
# Remove the SHOOTING column from the frame in place.
dt_crime.drop(columns='SHOOTING', inplace=True)

# Bar chart of the value frequencies of each categorical column of interest.
for column_name in ('DISTRICT', 'UCR_PART', 'STREET'):
    frequencies = dt_crime[column_name].value_counts()
    frequencies.plot.bar()
# I will write a function that finds the mode of a column containing null values
# and then use it to impute (fill) the nulls in each such column.
def impute_nan(df, variable):
    """Fill the missing values of ``df[variable]`` with that column's mode.

    The DataFrame is updated in place and nothing is returned.

    Args:
        df: pandas DataFrame that holds the column.
        variable: label of the column to impute.

    If the column has no non-null value there is no mode to impute with,
    and the column is left unchanged.
    """
    modes = df[variable].mode()
    if modes.empty:
        # Every value is missing: nothing sensible to fill with.
        return
    # mode() can return several tied values; use the first one.
    most_freq_cat = modes.iloc[0]
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the `df[variable]` selection: the chained in-place form acts on an
    # intermediate object and is not guaranteed to update `df`.
    df[variable] = df[variable].fillna(most_freq_cat)
# Mode-impute each categorical column that contains nulls.
for column_name in ('DISTRICT', 'UCR_PART', 'STREET'):
    impute_nan(dt_crime, column_name)
# The null values in 'DISTRICT', 'UCR_PART' and 'STREET' have now been filled.
# Count the missing values that remain in each column.
dt_crime.isna().sum()
# Now I will fill the null values of Latitude and Longitude with the median.
# Median of each coordinate column.
median_lat = dt_crime.Lat.median()
median_long = dt_crime.Long.median()

# Fill missing coordinates with the median. The result is assigned back to
# the column rather than using fillna(inplace=True) on the `dt_crime[...]`
# selection, because the chained in-place form acts on an intermediate object
# and is not guaranteed to update dt_crime.
dt_crime['Lat'] = dt_crime['Lat'].fillna(median_lat)
dt_crime['Long'] = dt_crime['Long'].fillna(median_long)

# Re-check the per-column null counts.
dt_crime.isnull().sum()
# Now I will combine the Latitude and Longitude columns into a new LatLong column
# and drop the Location column.
# Store each row's (Lat, Long) pair as a tuple in a single LatLong column.
lat_long_pairs = list(zip(dt_crime['Lat'], dt_crime['Long']))
dt_crime['LatLong'] = lat_long_pairs

# Drop the Location column in place.
dt_crime.drop(columns='Location', inplace=True)
dt_crime.head()
# Now I will derive new columns such as city, state and county from the latitude/longitude pairs.
def reverseGeocode(coordinates):
    """Look up *coordinates* with ``rg.search`` and return its result unchanged.

    Args:
        coordinates: the value forwarded to ``rg.search``; the caller in this
            file passes a list of (latitude, longitude) tuples.
    """
    return rg.search(coordinates)
if __name__ == "__main__":
    # One (lat, long) tuple per row; all pairs are looked up in a single call.
    coordinates = list(zip(dt_crime['Lat'], dt_crime['Long']))
    data = reverseGeocode(coordinates)

    # Copy the 'name', 'admin1' and 'admin2' entry of every lookup result
    # into a column of the same name.
    for field in ('name', 'admin1', 'admin2'):
        dt_crime[field] = [hit[field] for hit in data]

    # Save the enriched frame to dt_crime.csv.
    dt_crime.to_csv("dt_crime.csv")

# Give the geocoder fields descriptive column names.
geo_column_names = {'name': 'City', 'admin1': 'State', 'admin2': 'County'}
dt_crime.rename(columns=geo_column_names, inplace=True)

# Inspect the result.
dt_crime.head()
dt_crime.columns
dt_crime.City.unique()
dt_crime.OFFENSE_CODE_GROUP.unique()
# Plotting YEAR, MONTH, City and OFFENSE_CODE_GROUP using seaborn.
# Global seaborn theme for the plots that follow.
sns.set(style='darkgrid', font_scale=1.0)

# Bar catplot of MONTH against YEAR, coloured by City, with one facet per
# OFFENSE_CODE_GROUP laid out two facets per row.
catplot_options = dict(
    x='YEAR',
    y='MONTH',
    hue='City',
    col='OFFENSE_CODE_GROUP',
    col_wrap=2,
    data=dt_crime,
    kind='bar',
)
sns.catplot(**catplot_options)

import folium
dt_crime.shape
# Now I will plot the latitude and longitude with City and OFFENSE_CODE_GROUP on an interactive folium map.
# The dataset has 319073 rows, so we will plot only the first 50 rows and the last 50 rows.
import folium
# `display` is only predefined inside IPython/Jupyter sessions; import it
# explicitly so the name also resolves when this runs as a plain script.
from IPython.display import display


def _add_crime_markers(frame, target_map):
    """Add one circle marker per row of *frame* to *target_map*.

    Args:
        frame: DataFrame with 'Lat', 'Long', 'City' and 'OFFENSE_CODE_GROUP'
            columns.
        target_map: the folium map the markers are added to.

    Each marker's popup shows the capitalised city and the offense group.
    """
    rows = zip(frame['Lat'], frame['Long'], frame['City'],
               frame['OFFENSE_CODE_GROUP'])
    for lat, lon, city, offense_group in rows:
        folium.CircleMarker(
            [lat, lon],
            popup=('City: ' + str(city).capitalize() + '<br>'
                   'OFFENSE_CODE_GROUP: ' + str(offense_group)),
            # Full CSS colour name; the matplotlib-style shorthand 'b' is not
            # a colour the map renderer understands.
            color='blue',
            fill=True,
            fill_opacity=0.7,
        ).add_to(target_map)


# Map centred on fixed coordinates, showing the first 50 records.
dt_1 = dt_crime.head(50)
latitude = 42.30682138
longitude = -71.06030035
traffic_map = folium.Map(location=[latitude, longitude], zoom_start=5)
# NOTE(review): colordict is not referenced by the code visible here; it is
# kept in case later code uses it.
colordict = {0: 'lightblue', 1: 'lightgreen', 2: 'orange', 3: 'red'}
_add_crime_markers(dt_1, traffic_map)
display(traffic_map)

# Add the last 50 records to the same map and show it again.
dt_2 = dt_crime.tail(50)
_add_crime_markers(dt_2, traffic_map)
display(traffic_map)
# Number of non-null OFFENSE_CODE_GROUP entries per (YEAR, City, Lat, Long, MONTH) group.
group_keys = ['YEAR', 'City', 'Lat', 'Long', 'MONTH']
dt = dt_crime.groupby(group_keys)['OFFENSE_CODE_GROUP'].count()
def histogram(data, path, color, title, xaxis, yaxis):
    """Build and show a Plotly Express histogram.

    Args:
        data: data set passed to ``px.histogram``.
        path: passed as the ``x`` argument of the histogram.
        color: passed as the ``color`` argument of the histogram.
        title: figure title.
        xaxis: x-axis title.
        yaxis: y-axis title.
    """
    layout_options = {
        'title_text': title,
        'xaxis_title_text': xaxis,
        'yaxis_title_text': yaxis,
        'bargap': 0.2,
        'bargroupgap': 0.1,
    }
    figure = px.histogram(data, x=path, color=color)
    figure.update_layout(**layout_options)
    figure.show()
# Histogram of records per offense group, coloured by offense group.
histogram(
    dt_crime,
    "OFFENSE_CODE_GROUP",
    "OFFENSE_CODE_GROUP",
    'Major Crimes in Boston',
    'Crime',
    'Count',
)

# Two-column table: offense group and the number of records in that group.
Number_crimes = dt_crime['OFFENSE_CODE_GROUP'].value_counts()
values = Number_crimes.values
categories = pd.DataFrame(
    data=Number_crimes.index,
    columns=["OFFENSE_CODE_GROUP"],
)
categories['values'] = values
def treemap(categories, title, path, values):
    """Build and show a Plotly Express treemap.

    Args:
        categories: data set passed to ``px.treemap``.
        title: figure title.
        path: passed as the ``path`` argument of the treemap.
        values: passed as the ``values`` argument of the treemap.
    """
    figure = px.treemap(
        categories,
        path=path,
        values=values,
        height=700,
        title=title,
        color_discrete_sequence=px.colors.sequential.RdBu,
    )
    # Set the text shown on the tiles of the first trace.
    first_trace = figure.data[0]
    first_trace.textinfo = 'label+text+value'
    figure.show()
# Treemap of the offense groups, with tiles sized by record count.
treemap(
    categories,
    title='Major Crimes in Boston',
    path=['OFFENSE_CODE_GROUP'],
    values=categories['values'],
)
def bar(categories, x, y, color, title, xlab, ylab):
    """Build and show a Plotly Express bar chart.

    Args:
        categories: data set passed to ``px.bar``.
        x: passed as the ``x`` argument of the chart.
        y: passed as the ``y`` argument of the chart.
        color: passed as the ``color`` argument of the chart.
        title: figure title.
        xlab: x-axis title.
        ylab: y-axis title.
    """
    layout_options = {
        'title_text': title,
        'xaxis_title_text': xlab,
        'yaxis_title_text': ylab,
        'bargap': 0.2,
        'bargroupgap': 0.1,
    }
    figure = px.bar(categories, x=x, y=y, color=color, height=400)
    figure.update_layout(**layout_options)
    figure.show()
# Bar chart of the first ten rows of the offense-group table.
top_crime_names = categories['OFFENSE_CODE_GROUP'][0:10]
top_crime_counts = categories['values'][0:10]
bar(
    categories,
    top_crime_names,
    top_crime_counts,
    top_crime_names,
    'Top 10 Major Crimes in Boston',
    'Crime',
    'Count',
)
# Plotting the number of crimes per year.
import plotly.express as px

# Two-column table: year and the number of records in that year.
Number_crimes_year = dt_crime['YEAR'].value_counts()
years = pd.DataFrame(
    data=Number_crimes_year.index,
    columns=["YEAR"],
)
years['values'] = Number_crimes_year.values

# Pie chart with one slice per year, sized by record count.
pie_options = dict(
    values='values',
    names='YEAR',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig = px.pie(years, **pie_options)
fig.show()
import plotly.graph_objects as go

# Two-column table: month and the number of records in that month.
Number_crimes_month = dt_crime['MONTH'].value_counts()
months = pd.DataFrame(
    data=Number_crimes_month.index,
    columns=["MONTH"],
)
months['values'] = Number_crimes_month.values

# Horizontal bars: counts along x, months along y.
month_bars = go.Bar(
    x=months['values'],
    y=months['MONTH'],
    marker=dict(color='rgb(13,143,129)'),
    orientation='h',
)
fig = go.Figure(month_bars)

month_layout = {
    'title_text': 'Crimes in Boston as per month',
    'xaxis_title_text': 'Count',
    'yaxis_title_text': 'Month',
    'bargap': 0.2,
    'bargroupgap': 0.1,
}
fig.update_layout(**month_layout)
fig.show()
# Two-column table: day of week and the number of records on that day.
Number_crimes_days = dt_crime['DAY_OF_WEEK'].value_counts()
days = pd.DataFrame(data=Number_crimes_days.index, columns=["DAY_OF_WEEK"])
days['values'] = Number_crimes_days.values

# Horizontal histogram: DAY_OF_WEEK is placed on the y axis, so the counts
# run along x. The axis titles are set to match that orientation.
fig = px.histogram(dt_crime, y="DAY_OF_WEEK", color="DAY_OF_WEEK")
fig.update_layout(
    title_text='Crime count per day',
    xaxis_title_text='Crimes Count',
    yaxis_title_text='Day',
    bargap=0.2,
    bargroupgap=0.1
)
fig.show()

# Donut chart (pie with a 0.4 hole) of the same per-day counts.
fig = go.Figure(data=[go.Pie(labels=days['DAY_OF_WEEK'], values=days['values'], hole=.4)])
fig.update_layout(
    title_text='Crime count on each day',
)
fig.show()
# Records per hour.
histogram(dt_crime, "HOUR", "HOUR", 'Crime count on each Hour', 'Hour', 'Count')
# Records per year; the title and x label name the year, which is what is plotted.
histogram(dt_crime, "YEAR", "YEAR", 'Crime count on each Year', 'Year', 'Count')
# Records per year, coloured by month.
histogram(dt_crime, "YEAR", "MONTH", 'Crime count on each year per month', 'Year', 'Crimes Count')
# Two-column table: city and the number of records in that city.
Number_crimes_city = dt_crime['City'].value_counts()
city = pd.DataFrame(
    data=Number_crimes_city.index,
    columns=["City"],
)
city['values'] = Number_crimes_city.values

# Bar chart of the first ten rows of the city table.
top_city_names = city['City'][0:10]
top_city_counts = city['values'][0:10]
bar(
    city,
    top_city_names,
    top_city_counts,
    top_city_names,
    'Top 10 Crime count on each City',
    'City',
    ' Crime Count',
)
# Records per offense group, coloured by year.
histogram(
    dt_crime,
    path="OFFENSE_CODE_GROUP",
    color="YEAR",
    title='Crime count per Category on each Year',
    xaxis='Category',
    yaxis='Crimes Count on each Year',
)
# Records per offense group, coloured by month.
histogram(
    dt_crime,
    path="OFFENSE_CODE_GROUP",
    color="MONTH",
    title='Crime count per Category on each Month',
    xaxis='Category',
    yaxis='Crimes Count on each Month',
)